Search for the variables mapbox_access_token and GOOGLE_API_KEY below, uncomment their lines and enter your own keys. Both keys are free to obtain.
import pandas as pd
import numpy as np
import os
from math import pow
from math import log
import random as rd
from scipy.stats.kde import gaussian_kde
import colorcet as cc
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from plotly.offline import init_notebook_mode, iplot
from IPython.display import display, HTML
import plotly.graph_objs as go
from bokeh import plotting as bplt
from bokeh.plotting import gmap #, figure
from bokeh.io import output_notebook, show, output_file, push_notebook
from bokeh.models import NumeralTickFormatter, ColumnDataSource, HoverTool, CustomJS, Slider, GMapOptions, Toggle, LinearAxis, Range1d
from bokeh.models.callbacks import CustomJS
from bokeh.layouts import row, column
from bokeh.palettes import Category20_16, Category20_20
from bokeh.models.widgets import CheckboxGroup, Slider, Dropdown
import keras
from keras import metrics
from keras import regularizers
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Flatten, Activation, Conv2D, MaxPooling2D, BatchNormalization
from keras.optimizers import Adam, RMSprop
from keras.callbacks import EarlyStopping
from keras.utils import plot_model
Download and Describe
# Load the King County house-sales dataset and take a first look at it.
df=pd.read_csv('kc_house_data.csv')
df.head()
# Data-quality checks: is any value null / NA anywhere in the frame?
df.isnull().values.any()
df.isna().values.any()
# Summary statistics for every numeric column.
df.describe()
Print Ranges of House Prices
# Count houses sold in $100k price bands up to $1m, then $1m bands up to $8m.
# Each band count is the difference between two cumulative "< threshold" counts.
for band in range(10):
    below_lower = df['price'][df['price'] < 100000 * band].shape[0]
    below_upper = df['price'][df['price'] < 100000 * (band + 1)].shape[0]
    print("Houses Sold for %s-%sk: %s" % (band * 100, (band + 1) * 100, below_upper - below_lower))
for band in range(1, 8):
    below_lower = df['price'][df['price'] < 1000000 * band].shape[0]
    below_upper = df['price'][df['price'] < 1000000 * (band + 1)].shape[0]
    print("Houses Sold for %s-%sm: %s" % (band, band + 1, below_upper - below_lower))
Print Ranges of Lot Area
df['sqft_lot15']
# Count houses in 1k-sqft lot-area bands from 0 to 10k sqft.
for i in range(10):
    a = df['sqft_lot15'][df['sqft_lot15']<1000*i].shape[0]
    b = df['sqft_lot15'][df['sqft_lot15']<1000*(i+1)].shape[0]
    print("Houses with %s-%sk square feet: %s" %(i,(i+1),b-a))
# One coarser 10-20k band, then everything at or above 20k sqft.
i=1
a = df['sqft_lot15'][df['sqft_lot15']<10000*i].shape[0]
b = df['sqft_lot15'][df['sqft_lot15']<10000*(i+1)].shape[0]
print("Houses with %s-%sk square feet: %s" %(i*10,(i+1)*10,b-a))
print("Houses with >20k square feet: %s" %(df['sqft_lot15'][df['sqft_lot15']>=20000].shape[0]))
Print Ranges of Living Space
# Count houses in 250-sqft living-space bands between 1000 and 3000 sqft,
# plus open-ended bands below 1000 and at/above 3000 sqft.
print("Houses with less than 1k square feet of living space: %s" %(df['sqft_living15'][df['sqft_living15']<1000].shape[0]))
for i in range(0,8):
    # Fixed: count the sqft_living15 column itself. The original indexed
    # df['sqft_lot'] with a sqft_living15 mask — the count was the same,
    # but the column reference was wrong/misleading.
    a = df['sqft_living15'][df['sqft_living15']<250*i+1000].shape[0]
    b = df['sqft_living15'][df['sqft_living15']<250*(i+1)+1000].shape[0]
    print("Houses with %s-%s square feet of living space: %s" %(250*i+1000,(i+1)*250+1000,b-a))
print("Houses with more than 3k square feet of living space: %s" %(df['sqft_living15'][df['sqft_living15']>=3000].shape[0]))
Get Price Names
# Build human-readable price labels: '$450k' for sub-$1m prices, and
# '$1.2m' for prices at or above $1m (any label longer than 5 characters,
# e.g. '$1000k', is rewritten in millions).
k = round(df['price']/1000,0).astype(int)
df['price_names'] = '$' + k.astype(str) + 'k'
for i in range(df['price_names'].shape[0]):
    if len(df['price_names'][i])>5:
        # Strip the leading '$' and trailing 'k', convert thousands to millions.
        num = round(int(df['price_names'][i][1:(len(df['price_names'][i])-1)])/1000,1)
        df.loc[i,'price_names'] = '$' + str(num) + 'm'
Get Months
# Map each sale's YYYYMM date prefix to a short month label ('May 14' etc.).
months = ['May 14','Jun 14','Jul 14','Aug 14','Sep 14','Oct 14','Nov 14','Dec 14','Jan 15','Feb 15','Mar 15','Apr 15','May 15']
months_in_date_form = ['201405','201406','201407','201408','201409','201410','201411','201412','201501','201502','201503','201504','201505']
date = dict(zip(months_in_date_form,months))
# Vectorized replacement for the original row-by-row .loc loop: take the
# first 6 characters of every date string and look them up in the mapping.
# Identical result, one C-level pass instead of n Python iterations.
df['month'] = df['date'].str[:6].map(date)
Get Year Built or Renovated, whichever is more recent
# Most recent of build year and renovation year. yr_renovated is 0 for
# never-renovated houses (the original only overwrote rows with
# yr_renovated > 0), so a row-wise max over the two columns is equivalent
# to the original "copy yr_built, then overwrite renovated rows" loop —
# and vectorized instead of one .loc call per renovated row.
df['max_yr_built_or_renovated'] = df[['yr_built','yr_renovated']].max(axis=1)
init_notebook_mode(connected=True)
# Bucket prices into four bands used for map colour-coding.
bins = [0,300000,500000,700000,8000000]
df['binned'] = pd.cut(df['price'], bins).astype(str).tolist()
subset = df[['lat','long','price','binned','price_names','month']]
# subs['s0']..subs['s3']: one sub-frame per price band, in sorted band order.
subs={}
for t in range(df['binned'].unique().shape[0]):
    subs["s"+str(t)+""]=subset[subset['binned']==np.sort(df['binned'].unique())[t]]
#mapbox_access_token = '*******************************************************************************************'
# One Scattermapbox trace per price band, coloured from orange to dark red.
colours = ['rgb(255,165,0)','rgb(255,140,0)','rgb(255,0,0)','rgb(128,0,0)']
labels = ["< $0.3m","$0.3-0.5m","$0.5-0.7m","> $0.7m" ]
traces = []
for i in range(4):
    x = subs['s'+str(i)]
    col = colours[i]
    traces.append(go.Scattermapbox(
        lat=x['lat'].astype(str).tolist(),
        lon=x['long'].astype(str).tolist(),
        mode='markers',
        marker = dict(
            size = 4,
            color = col,
        ),
        # Hover shows only the formatted price label.
        text=x['price_names'].tolist(),
        hoverinfo = 'text',
        name=labels[i]
    ))
data = traces
layout = go.Layout(
    autosize=True,
    hovermode='closest',
    title="2014-15 Sold House Prices in King County",
    showlegend=True,
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        # Centre the map on the median coordinates of all sales.
        center=dict(
            lat=np.median(df['lat']),
            lon=np.median(df['long'])
        ),
        pitch=0,
        zoom=9
    ),
)
fig = dict(data=data, layout=layout)
iplot(fig, filename='Seattle Prices by Colour')
Hover mouse over location for price tag and zoom in for further inspection.
# Initial frame of the animated map: month index 0 ('May 14').
value=0
traces = []
for i in range(4):
    # Restrict each price-band sub-frame to sales in the selected month.
    x = subs['s'+str(i)][subs['s'+str(i)]['month']==months[value]]
    col = colours[i]
    traces.append(go.Scattermapbox(
        lat=x['lat'].astype(str).tolist(),
        lon=x['long'].astype(str).tolist(),
        mode='markers',
        marker = dict(
            size = 7,
            color = col,
        ),
        text=x['price_names'].tolist(),
        hoverinfo = 'text',
        name=labels[i]
    ))
data = traces
# Plotly slider scaffold; 'steps' is filled in later, one step per month.
sliders_dict = {
    'active': 0,
    'yanchor': 'top',
    'xanchor': 'left',
    'currentvalue': {
        'font': {'size': 20},
        'prefix': 'Month:',
        'visible': True,
        'xanchor': 'right'
    },
    'transition': {'duration': 300, 'easing': 'cubic-in-out'},
    'pad': {'b': 10, 't': 50},
    'len': 0.9,
    'x': 0.1,
    'y': 0,
    'steps': []
}
# Layout for the animated map: mapbox view, slider, and play/pause buttons.
layout = go.Layout(
    autosize=True,
    height = 500,
    margin = {'l': 20, 'b': 30, 'r':10, 't': 100},
    hovermode='closest',
    title="Sold House Prices in King County",
    showlegend=True,
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=dict(
            lat=np.median(df['lat']),
            lon=np.median(df['long'])
        ),
        pitch=0,
        zoom=9
    ),
    sliders = [sliders_dict
    # args = ['transition', dict(duration=400,easing='cubic-in-out')],
    # initialValue='May 14',
    # plotlycommand='animate',
    # values = months,
    # visible=True
    ],
    # Play/Pause animation buttons (standard plotly animate idiom).
    updatemenus = [dict(
        buttons = [
            dict(
                args= [None,dict(frame=dict(duration=500,redraw=False),fromcurrent=True,
                      transition=dict(duration=300,easing='quadratic-in-out'))],
                label='Play',
                method='animate'
            ),
            dict(
                args= [None,dict(frame=dict(duration=0,redraw=False),mode='immediate',
                      transition=dict(duration=0))],
                label='Pause',
                method='animate'
            )],
        direction = 'left',
        pad = {'r': 10, 't': 87},
        showactive = False,
        type = 'buttons',
        x = 0.1,
        xanchor = 'right',
        y = 0,
        yanchor = 'top'
    )]
)
figure = dict(data=data, layout=layout)
figure['frames'] = []
# make frames
# Build one animation frame (four price-band traces) and one slider step
# per month, then attach the completed slider to the layout and plot.
for value in range(len(months)):
    frame = {'data': [], 'name': months[value]}
    for i in range(4):
        x = subs['s'+str(i)][subs['s'+str(i)]['month']==months[value]]
        col = colours[i]
        frame['data'].append(go.Scattermapbox(
            lat=x['lat'].astype(str).tolist(),
            lon=x['long'].astype(str).tolist(),
            mode='markers',
            marker = dict(
                size = 7,
                color = col,
            ),
            text=x['price_names'].tolist(),
            hoverinfo = 'text',
            name=labels[i]
        ))
    figure['frames'].append(frame)
    # Slider step that jumps the animation to this month's frame.
    slider_step = {'args': [
        [months[value]],
        {'frame': {'duration': 300, 'redraw': False},
         'mode': 'immediate',
         'transition': {'duration': 300}}
    ],
        'label': months[value],
        'method': 'animate'}
    sliders_dict['steps'].append(slider_step)
figure['layout']['sliders'] = [sliders_dict]
iplot(figure)
Play through months to see how prices change in various locations over time. Or select a month from the slider.
# Scatter of log(price) vs living space, coloured by build grade.
# Scattergl renders via WebGL — needed for ~21k points.
dataPoints = go.Scattergl(
    x=df['sqft_living15'],
    y=np.log(df['price']),
    mode='markers',
    marker=dict(
        opacity=0.75,
        color=df.grade,
        showscale=True,
        colorscale='Jet',
        colorbar=dict(
            title='Grade'
        ),
    ),
    name='Data points'
)
data=[dataPoints]
layout = go.Layout(
    title='Log Price vs Living Space',
    xaxis=dict(
        title='Living Space (Square Feet)'
    ),
    yaxis=dict(
        title='Log(House Price)'
    ),
    #showlegend=True
)
figure = go.Figure(data=data, layout=layout)
iplot(figure)
# Scatter of living space vs year built/renovated, coloured by price.
# Prices are capped at $1m so the colour scale isn't dominated by outliers.
temp_df = df[df['price']<1000000]
data = [
    {
        'x': temp_df['max_yr_built_or_renovated'],
        'y': temp_df['sqft_living15'],
        'mode': 'markers',
        'marker': {
            'color': temp_df['price'],
            'showscale': True,
            'colorbar': {'title':'Price'}
        },
        'name':'Price'
    }
]
layout = go.Layout(
    title = "Price (Capped at $1m) vs Square Feet of Living Space and Year Built or Renovated",
    yaxis=dict(title='Living Space (Square Feet)'),
    xaxis=dict(title='Year Built or Renovated')
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='scatter-colorscale')
# Per-month median sale price and sales volume, plotted on twin y-axes.
monthly_median_prices = []
monthly_volume = []
for m in range(len(months)):
    temp_df = df[df['month']==months[m]]
    monthly_median_prices.append(int(np.median(temp_df['price'])))
    monthly_volume.append(temp_df.shape[0])
trace1 = go.Scatter(
    x=months,
    y=monthly_median_prices,
    name='Median Price'
)
# Volume goes on the secondary (right-hand) axis.
trace2 = go.Scatter(
    x=months,
    y=monthly_volume,
    name='Sales',
    yaxis='y2'
)
data = [trace1, trace2]
layout = go.Layout(
    title='2014-15 King County Real Estate Market Activity',
    yaxis=dict(
        title='Median Price'
    ),
    yaxis2=dict(
        title='Sales',
        titlefont=dict(
            color='rgb(148, 103, 189)'
        ),
        tickfont=dict(
            color='rgb(148, 103, 189)'
        ),
        overlaying='y',
        side='right'
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='multiple-axes-double')
Prices seem to track volume. A greater number of houses sold indicates greater demand, and hence higher prices.
# Bin living space into bands: [0,1000], 250-sqft bands up to 3000, then [3000,10000].
bounds = [0]+[1000]+[1000+i*250 for i in range(1,9)] + [10000]
df['sqft_liv15_binned'] = pd.cut(df['sqft_living15'], bounds).astype(str).tolist()
bins = np.sort(df['sqft_liv15_binned'].unique())
# Per-band median price, median bedroom count, and the band's midpoint.
median_prices = []
median_bedrooms = []
mid_bounds = []
for j in range(len(bounds)-1):
    median_prices.append(np.median(df['price'][df['sqft_liv15_binned']==bins[j]]))
    median_bedrooms.append(np.median(df['bedrooms'][df['sqft_liv15_binned']==bins[j]]))
    mid_bounds.append(round((bounds[j]+bounds[j+1])/2,0))
# Twin-axis plot: median price (left axis) and median bedrooms (right axis)
# against living-space band.
trace1 = go.Scatter(
    x=bounds,
    y=median_prices,
    name='Median Price'
)
trace2 = go.Scatter(
    x=bounds,
    y=median_bedrooms,
    name='Median Bedroom',
    yaxis='y2'
)
data = [trace1, trace2]
layout = go.Layout(
    title='Living Space vs Price & Bedrooms ',
    xaxis=dict(title='Living Space (Square Feet)'),
    yaxis=dict(
        title='Median Price'
    ),
    yaxis2=dict(
        title='Median Bedrooms',
        titlefont=dict(
            color='rgb(148, 103, 189)'
        ),
        tickfont=dict(
            color='rgb(148, 103, 189)'
        ),
        overlaying='y',
        side='right'
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='binned_sqft')
# Default categorical variable for the violin plot below.
variable = 'view'
def violin_plot(variable):
    """Build a plotly violin-plot figure of price (capped at $1m) per category of `variable`.

    Returns a plotly figure dict suitable for passing to iplot().
    """
    temp_df = df[df['price'] < 1000000]
    # Fixed: iterate the categories of the *filtered* frame. The original
    # sized the loop from the unfiltered df's uniques while indexing the
    # filtered frame's sorted uniques, which can over-index (IndexError)
    # when a category only occurs among $1m+ houses.
    categories = np.sort(temp_df[variable].unique())
    data = []
    for cat in categories:
        mask = temp_df[variable] == cat
        trace = {
            "type": 'violin',
            "x": temp_df[variable][mask],
            "y": temp_df['price'][mask],
            "name": cat,
            "box": {
                "visible": True
            },
            "meanline": {
                "visible": True
            }
        }
        data.append(trace)
    fig = {
        "data": data,
        "layout" : {
            "title": "Violin Plot for Price (Capped at $1m) vs " + variable.title(),
            "yaxis": {
                "zeroline": False,
                "title": "Price ($)"
            },
            "xaxis": {
                "title": variable.title()
            }
        }
    }
    return fig
iplot(violin_plot('view'), filename='violin', validate = False)
# price, bedrooms, floors, yr_built
# For the radar chart: median attribute values of waterfront vs
# non-waterfront houses, each normalised by the overall median (so the
# "median house" scores 1.0 on every axis).
wf = []
not_wf = []
wf_df = df[df['waterfront']==1]
not_wf_df = df[df['waterfront']==0]
variables = ['price','bedrooms','grade','sqft_lot15','sqft_living15','condition']
for i in variables:
    temp_val = np.median(df[i])
    wf.append(round(np.median(wf_df[i])/temp_val,2))
    not_wf.append(round(np.median(not_wf_df[i])/temp_val,2))
# Radar (polar) chart comparing the normalised medians from above.
trace1 = go.Scatterpolar(
    r = wf,
    theta = variables,
    fill = 'toself',
    name = "Waterfront"
)
trace2 = go.Scatterpolar(
    r = not_wf,
    theta = variables,
    fill = 'toself',
    name = "Not Waterfront"
)
data = [trace1,trace2]
layout = go.Layout(
    title = "How Does the Median Waterfront House Compare to the Median House Sold?<br>Median House Has A Value of 1 For Each Attribute",
    polar = dict(
        radialaxis = dict(
            visible = True,
            range = [0, 4],
            tick0=0,
            dtick=1
        )
    ),
    showlegend = True
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "radar")
So houses on the waterfront tend to have 3x the price of non-waterfront houses and twice the sqft, but no difference in number of bedrooms or grade or condition.
# Route subsequent bokeh output into the notebook.
output_notebook()
# Styling for a plot
def style(p):
    """Apply the notebook's shared typography to a bokeh figure and return it."""
    # Title
    p.title.align = 'center'
    p.title.text_font_size = '20pt'
    p.title.text_font = 'serif'
    # Both axes get identical label and tick styling.
    for axis in (p.xaxis, p.yaxis):
        axis.axis_label_text_font_size = '14pt'
        axis.axis_label_text_font_style = 'bold'
        axis.major_label_text_font_size = '12pt'
    return p
""""""
temp_df = df['price']/1000
hist, edges = np.histogram(temp_df,
bins = 40,
range = [0, 1500])
# Put the information in a dataframe
prices = pd.DataFrame({'houses': hist,
'left': edges[:-1],
'right': edges[1:]})
# Add a column showing the extent of each interval
prices['interval'] = ['%s - %s' % (int(round(left,0)), int(round(right,0))) for left, right in zip(prices['left'], prices['right'])]
# Convert dataframe to column data source
src = ColumnDataSource(prices)
# Create the blank plot
p = bplt.figure(plot_height = 600, plot_width = 600,
title = 'Histogram of House Prices (Capped at $1.5m)',
x_axis_label = 'Price (in thousands of dollars)',
y_axis_label = 'Number of Houses')
# Add a quad glyph
p.quad(source = src, bottom=0, top='houses', left='left', right='right', fill_color='green', line_color='black',
hover_fill_alpha = 1.0, hover_fill_color = 'red')
# Hover tool referring to our own data field using @ and
# a position on the graph using $
h = HoverTool(tooltips = [('Price Range ($k)', '@interval'),('Number of Houses Sold', '@houses')])
# Add the hover tool to the graph
p.add_tools(h)
p.xaxis.formatter=NumeralTickFormatter(format="00")
style(p)
# Specify the output file and save
#output_file('hist.html')
# Show the plot
show(p)
Hover over for information about bar.
def make_dataset(value_list=np.sort(df['view'].unique()).tolist(),
                 range_start = 0, range_end = 1500, bin_width = 200, value='view'):
    """Build a ColumnDataSource of per-category price histograms (as proportions).

    For each category in `value_list` of column `value`, histogram the
    prices (in $K) over [range_start, range_end) with bins of `bin_width`,
    normalise to proportions, and stack the results into one frame.

    NOTE(review): the default `value_list` is evaluated once at definition
    time against the global df — confirm that is intended.
    """
    # Check to make sure the start is less than the end!
    assert range_start < range_end, "Start must be less than end!"
    by_value = pd.DataFrame(columns=['proportion', 'left', 'right',
                                     'h_proportion', 'interval',
                                     'name', 'color'])
    range_extent = range_end - range_start
    # Collect one frame per category, then concatenate once at the end.
    # (pd.concat replaces the deprecated/removed DataFrame.append.)
    pieces = []
    for i, value_name in enumerate(value_list):
        # Prices (in $K) for this category only.
        subset = df['price'][df[value] == value_name]/1000
        hist, edges = np.histogram(subset, bins = int(range_extent / bin_width), range = [range_start, range_end])
        # Divide the counts by the total to get a proportion.
        prices = pd.DataFrame({'proportion': hist / np.sum(hist),
                               'left': edges[:-1], 'right': edges[1:] })
        # Text-formatted proportion and interval for hover tooltips.
        prices['h_proportion'] = ['%0.5f' % proportion for proportion in prices['proportion']]
        prices['interval'] = ['%s - %s' % (int(round(left,0)), int(round(right,0))) for left, right in zip(prices['left'], prices['right'])]
        # Category label and a colour per category (Category20_16 supports
        # at most 16 categories).
        prices['name'] = value_name
        prices['color'] = Category20_16[i]
        pieces.append(prices)
    by_value = pd.concat([by_value] + pieces)
    by_value = by_value.sort_values(['name', 'left'])
    # Convert dataframe to column data source
    return ColumnDataSource(by_value)
def make_plot(src, value):
    """Assemble the stacked price-histogram figure for `value` from source `src`."""
    # Blank figure with the shared axis labelling.
    fig = bplt.figure(plot_width = 700, plot_height = 700,
                      title = 'Histogram of House Prices by '+value.title(),
                      x_axis_label = 'Price ($K)', y_axis_label = 'Proportion')
    # One quad per histogram bar; colour and legend label come from the source.
    fig.quad(source = src, bottom = 0, top = 'proportion', left = 'left', right = 'right',
             color = 'color', fill_alpha = 0.7, hover_fill_color = 'color', legend = 'name',
             hover_fill_alpha = 1.0, line_color = 'black')
    # Hover tool in vline mode shows every bar under the cursor's x position.
    hover = HoverTool(tooltips=[(value.title(), '@name'),
                                ('Price Range ($k)', '@interval'),
                                ('Proportion', '@h_proportion')],
                      mode='vline')
    fig.add_tools(hover)
    # Shared typography.
    style(fig)
    return fig
# variable='view'
# variable_list=np.sort(df[variable].unique()).tolist()
# src = make_dataset(value_list=variable_list,range_start = 0, range_end = 1500, bin_width = 50, value=variable)
# p = make_plot(src, variable)
# show(p)
def graph(source, colour):
    """Add one category's histogram quads to the current global figure `p`."""
    quad_options = dict(bottom = 0, top = 'proportion', left = 'left', right = 'right',
                        color = colour, fill_alpha = 0.7, hover_fill_color = 'color',
                        legend = 'name', hover_fill_alpha = 1.0, line_color = 'black')
    return p.quad(source = source, **quad_options)
callback1 = CustomJS(code="""
if (IPython.notebook.kernel !== undefined) {
var kernel = IPython.notebook.kernel;
cmd = "update_bw(" + cb_obj.value + ")";
kernel.execute(cmd, {}, {});
}
""")
def hist_start(title):
    """Create the blank, styled figure used by the interactive histograms."""
    # Local name chosen so it doesn't shadow the module-level matplotlib `plt`.
    fig = bplt.figure(plot_width = 700, plot_height = 500,
                      title = 'Histogram of House Prices by '+title.title(),
                      x_axis_label = 'Price ($K)', y_axis_label = 'Proportion')
    style(fig)
    return fig
# Interactive histogram of price by 'view': one glyph + data source per
# category, a checkbox group to toggle categories (via JS), and a slider
# that re-bins the data (via a kernel round-trip to update_bw).
variable = 'view'
p = hist_start(variable)
variable_list = np.sort(df[variable].unique()).tolist()
args = []
code = "active = cb_obj.active;"
sources = []
# NOTE(review): this shuffles the shared bokeh palette list in place.
rd.shuffle(Category20_20)
for i in range(len(variable_list)):
    sources += [make_dataset(value_list=[variable_list[i]], value=variable)]
    glyph = graph(sources[i], colour=Category20_20[i])
    # Expose each glyph to the checkbox's JS callback so it can be hidden.
    args += [('glyph'+str(i),glyph)]
    code += "glyph{}.visible = active.includes({});".format(i,i)
def update_bw(bin_wd):
    # Called from JS (callback1): rebuild every source with the new bin width.
    for i in range(len(variable_list)):
        new_src = make_dataset(value_list=[variable_list[i]], bin_width = bin_wd, value=variable)
        sources[i].data = new_src.data
    push_notebook(handle=bokeh_handle)
bin_select = Slider(start = 10, end = 500,
                    step = 10, value = 50,
                    title = 'Width of Price Ranges ($K)',callback=callback1)
checkbox = CheckboxGroup(labels=[str(i) for i in variable_list],\
                         active=[i for i in range(len(variable_list))],\
                         callback = CustomJS(args={key:value for key,value in args},code=code))
bokeh_handle = show(row(p, column(bin_select, checkbox)), notebook_handle=True)
Need to use Bokeh server to use tabs and change variable
# Same interactive histogram as above, but for 'floors'.
# NOTE(review): this duplicates the 'view' cell verbatim and rebinds
# update_bw/sources/bokeh_handle, so the earlier plot's slider now updates
# this plot's data — a candidate for extraction into a function.
variable = 'floors'
p = hist_start(variable)
variable_list = np.sort(df[variable].unique()).tolist()
args = []
code = "active = cb_obj.active;"
sources = []
rd.shuffle(Category20_20)
for i in range(len(variable_list)):
    sources += [make_dataset(value_list=[variable_list[i]], value=variable)]
    glyph = graph(sources[i], colour=Category20_20[i])
    args += [('glyph'+str(i),glyph)]
    code += "glyph{}.visible = active.includes({});".format(i,i)
def update_bw(bin_wd):
    # Called from JS (callback1): rebuild every source with the new bin width.
    for i in range(len(variable_list)):
        new_src = make_dataset(value_list=[variable_list[i]], bin_width = bin_wd, value=variable)
        sources[i].data = new_src.data
    push_notebook(handle=bokeh_handle)
bin_select = Slider(start = 10, end = 500,
                    step = 10, value = 50,
                    title = 'Width of Price Ranges ($K)',callback=callback1)
checkbox = CheckboxGroup(labels=[str(i) for i in variable_list],\
                         active=[i for i in range(len(variable_list))],\
                         callback = CustomJS(args={key:value for key,value in args},code=code))
bokeh_handle = show(row(p, column(bin_select, checkbox)), notebook_handle=True)
# Display label / column name pairs for the categorical bar charts below.
drop_options=[
    {'label': 'Bedrooms', 'value': 'bedrooms'},
    {'label': 'Bathrooms', 'value': 'bathrooms'},
    {'label': 'Views', 'value': 'view'},
    {'label': 'Floors', 'value': 'floors'},
    {'label': 'Condition', 'value': 'condition'},
    {'label': 'Grade', 'value': 'grade'},
    {'label': 'Waterfront', 'value': 'waterfront'},
    {'label': 'Year Built', 'value': 'yr_built'},
    {'label': 'Year Renovated', 'value': 'yr_renovated'}]
def median_price(variable):
    """Return (string labels, median prices in $K, x-axis title) for `variable`.

    Labels are the sorted unique values of df[variable]; for 'yr_renovated'
    the 0 sentinel (never renovated) is excluded.
    """
    lab = ''
    for opt in drop_options:
        if opt['value'] == variable:
            lab = opt['label']
    # Plain column access instead of the original eval('df.'+variable+...):
    # same result, without building and evaluating code from a string.
    uniques = np.sort(df[variable].unique())
    if variable == 'yr_renovated':
        # Exclude the 0 sentinel explicitly. (The original dropped the first
        # element of the *unsorted* unique array before sorting, which only
        # removes 0 if 0 happens to appear first in the data.)
        labels = uniques[uniques > 0]
    else:
        labels = uniques
    values = labels.astype(float)
    for i in range(values.shape[0]):
        values[i] = int(np.median(df['price'][df[variable]==labels[i]])/1000)
    xaxis_title = 'Number of '+ lab.title()
    return([str(l) for l in labels],values.tolist(),xaxis_title)
def bar_chart_bokeh(variable):
    """Show a bokeh bar chart of median price ($K) per category of `variable`."""
    # Shuffle a *copy* of the palette: the original shuffled the imported
    # bokeh Category20_20 palette in place, mutating shared module state
    # for every other cell that uses it.
    palette = list(Category20_20)
    rd.shuffle(palette)
    labs,vals,xtitle = median_price(variable)
    p = bplt.figure(x_range=labs, plot_height=400, plot_width = 700, x_axis_label=xtitle, \
                    y_axis_label='Median Price ($K)', title="Median Price vs "+xtitle)
    # Bar width shrinks as the number of categories grows.
    p.vbar(x=labs, top=vals, width=3.6/len(labs), color=palette[0], line_color='black', \
           hover_fill_alpha = 1.0, hover_fill_color = palette[1])
    p.xgrid.grid_line_color = None
    p.y_range.start = 0
    h = HoverTool(tooltips = [('Median Price ($k)', '@top'),(xtitle, '@x')])
    p.add_tools(h)
    style(p)
    show(p)
# Render the median-price bar charts for two example variables.
bar_chart_bokeh('view')
bar_chart_bokeh('floors')
Bokeh, unlike Dash, is bad for switching between different plots. It requires a changing of data source. It is also difficult to use in Jupyter Notebook, where it needs javascript commands to operate. It is much easier to use in the bokeh server.
# Rebuild the four price-band sub-frames (same as the plotly map section)
# for the Google-Maps bokeh plots below.
bins = [0,300000,500000,700000,8000000]
df['binned'] = pd.cut(df['price'], bins).astype(str).tolist()
subset = df[['lat','long','price','binned','price_names','month']]
subs={}
for t in range(df['binned'].unique().shape[0]):
    subs["s"+str(t)+""]=subset[subset['binned']==np.sort(df['binned'].unique())[t]]
#GOOGLE_API_KEY = "**************************"
#output_file("gmap.html")
# Google-Maps bokeh plot of every sale, one circle glyph per price band.
map_options = GMapOptions(lat=np.median(df['lat'])-0.05, lng=np.median(df['long']), map_type="roadmap", zoom=10)
# For GMaps to function, Google requires you obtain and enable an API key:
#
#     https://developers.google.com/maps/documentation/javascript/get-api-key
#
# Replace the value below with your personal API key:p = gmap(GOOGLE_API_KEY, map_options, title="King County")
p = gmap(GOOGLE_API_KEY, map_options, title="2014-15 Sold House Prices in King County")
colours = ['rgb(255,165,0)','rgb(255,140,0)','rgb(255,0,0)','rgb(128,0,0)']
legend_labels = ['< $300k','$300-500k','$500-700k','> $700k']
sources = []
for i in range(4):
    col = colours[i]
    x = subs['s'+str(i)]
    sources.append(ColumnDataSource(x))
    p.circle(x="long", y="lat", size=4, fill_color=col, line_color=col, fill_alpha=0.8, source=sources[i], legend=legend_labels[i])
# Hover shows the formatted price label.
h = HoverTool(tooltips=[('Price', '@price_names')])
p.add_tools(h)
# Styling
style(p)
p.legend.location = "top_left"
# Clicking a legend entry hides that price band.
p.legend.click_policy="hide"
show(p)
# Second Google-Maps plot, to be filtered by month via the slider below.
# Note: it is initially populated with ALL sales; update_map() replaces the
# data when the slider or animation fires.
map_options = GMapOptions(lat=np.median(df['lat'])-0.05, lng=np.median(df['long']), map_type="roadmap", zoom=10)
# For GMaps to function, Google requires you obtain and enable an API key:
#
#     https://developers.google.com/maps/documentation/javascript/get-api-key
#
# Replace the value below with your personal API key:p = gmap(GOOGLE_API_KEY, map_options, title="King County")
p = gmap(GOOGLE_API_KEY, map_options, title="King County House Prices in May 2014")
colours = ['rgb(255,165,0)','rgb(255,140,0)','rgb(255,0,0)','rgb(128,0,0)']
legend_labels = ['< $300k','$300-500k','$500-700k','> $700k']
sources = []
for i in range(4):
    col = colours[i]
    x = subs['s'+str(i)]
    sources.append(ColumnDataSource(x))
    p.circle(x="long", y="lat", size=4, fill_color=col, line_color=col, fill_alpha=0.8, source=sources[i], legend=legend_labels[i])
h = HoverTool(tooltips=[('Price', '@price_names')])
p.add_tools(h)
# Styling
style(p)
p.legend.location = "top_left"
p.legend.click_policy="hide"
Dash is definitely smoother with maps (zooming in and out). But Bokeh allows you to hide plots easily.
def update_map(month):
    # Called from JS: swap each price band's data for the given month index
    # (0 == May 2014) and update the title, then push to the live plot.
    for i in range(4):
        col = colours[i]
        x = subs['s'+str(i)][subs['s'+str(i)]['month'] == months[month]]
        sources[i].data = ColumnDataSource(x).data
    # months entries look like 'May 14'; rebuild e.g. 'May ' + '20' + '14'.
    p.title.text = "King County House Prices in "+months[month][:4].title()+'20'+months[month][4:].title()
    push_notebook(handle=map_handle)
callback_map = CustomJS(code="""
if (IPython.notebook.kernel !== undefined) {
var kernel = IPython.notebook.kernel;
cmd = "update_map(" + cb_obj.value + ")";
kernel.execute(cmd, {}, {});
}
""")
month_slider = Slider(start = 0, end = 12,
step = 1, value = 0,
title = 'Months from May 2014',callback=callback_map)
callback_animate = CustomJS(code="""
var f = cb_obj.active;
var j = 0;
if(f == true){
mytimer = setInterval(replace_data, 500);
} else {
clearInterval(mytimer);
}
function replace_data() {
j++;
if(j>12){
j = 0;
}
var kernel = IPython.notebook.kernel;
cmd = "update_map(" + j + ")";
kernel.execute(cmd, {}, {});
}
""")
btn = Toggle(label="Play/Stop Animation of All Months", button_type="success",
active=False, callback=callback_animate)
map_handle = show(row(p, column(btn,month_slider)), notebook_handle=True)
def ridge(category, data, scale=300):
    """Pair each density value (scaled by `scale`) with its category label.

    Returns a list of (category, scaled_value) tuples, the coordinate format
    bokeh's categorical patch glyphs expect.
    """
    scaled = scale * data
    return [(category, value) for value in scaled]
def ridge_plot(variable):
    # Ridge (joy) plot: one kernel-density curve of price ($K, 0-1500) per
    # category of `variable`, stacked on a categorical y-axis.
    cats = np.sort(df[variable].unique()).tolist()
    labels = [str(i) for i in cats]
    # Spread colours across the 256-entry colorcet rainbow palette.
    palette = [cc.rainbow[i*15*int(17/len(cats))] for i in range(len(cats))]
    if variable == 'view':
        ylabel = 'Number of '+variable.title()+'s'
    else:
        ylabel = 'Number of '+variable.title()
    # Evaluation grid for the density curves.
    x = np.linspace(0,1500, 500)
    source = ColumnDataSource(data=dict(x=x))
    p = bplt.figure(y_range=labels, plot_width=900, x_range=(0, 1500), toolbar_location=None, title="Ridge Plot for "+variable.title(),
                    x_axis_label = 'Price ($K)', y_axis_label = ylabel)
    # Draw from the top category down so earlier patches sit behind later ones.
    for i, cat in enumerate(reversed(labels)):
        pdf = gaussian_kde((df['price'][df[variable]==cats[len(labels)-1-i]]/1000).astype(int))
        y = ridge(cat, pdf(x))
        source.add(y, cat)
        p.patch('x', cat, color=palette[i], alpha=0.6, line_color="black", source=source)
    # Strip chart furniture down to the ridges themselves.
    p.outline_line_color = None
    p.background_fill_color = "#efefef"
    p.ygrid.grid_line_color = None
    p.xgrid.grid_line_color = "#dddddd"
    p.xgrid.ticker = p.xaxis[0].ticker
    p.axis.minor_tick_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.axis_line_color = None
    p.y_range.range_padding = 0.12
    style(p)
    show(p)
# Ridge plots for two example categorical variables.
ridge_plot('view')
ridge_plot('floors')
# Bokeh version of the twin-axis monthly market-activity chart.
monthly_median_prices = []
monthly_volume = []
for m in range(len(months)):
    temp_df = df[df['month']==months[m]]
    monthly_median_prices.append(int(np.median(temp_df['price'])))
    monthly_volume.append(temp_df.shape[0])
source_pd = pd.DataFrame(
    {'months': months,
     'monthly_median_prices': [i/1000 for i in monthly_median_prices],
     'monthly_volume': monthly_volume
     })
source = ColumnDataSource(source_pd)
# Left y-range padded a hair around the median-price series (values in $K).
p = bplt.figure(plot_width=900, x_range = months,y_range=(min(monthly_median_prices)*0.000995,max(monthly_median_prices)*0.00105),\
                title="2014-15 King County Real Estate Market Activity",\
                x_axis_label = 'Month', y_axis_label = "Median Price ($K)")
p.line('months', 'monthly_median_prices', line_width=2, color='blue', legend="Median Price Sold", source=source)
# Secondary right-hand axis for sales volume.
p.extra_y_ranges = {"second_axis": Range1d(start=min(monthly_volume)*0.95, end=max(monthly_volume)*1.05)}
p.add_layout(LinearAxis(y_range_name="second_axis", axis_label="Number of Houses Sold"), 'right')
p.line('months', 'monthly_volume', line_width=2, y_range_name="second_axis", color='red', legend="Number of Houses Sold", source=source)
h = HoverTool(tooltips=[('Month', '@months'),('Median Price ($K)', '@monthly_median_prices'),('Number of Houses Sold', '@monthly_volume')])
p.add_tools(h)
p.legend.location = "bottom_left"
p.legend.click_policy="hide"
style(p)
show(p)
In Dash it automatically sets the hover tool.
# Assign each house a colour and marker size from its $100k price band
# (everything above $1m shares the final band).
bins = [100000*i for i in range(0,11)]+[8000000]
df['binned2'] = pd.cut(df['price'], bins).astype(str).tolist()
bins = ['(0, 100000]', '(100000, 200000]', '(200000, 300000]', '(300000, 400000]',\
        '(400000, 500000]', '(500000, 600000]', '(600000, 700000]',\
        '(700000, 800000]', '(800000, 900000]', '(900000, 1000000]', '(1000000, 8000000]',]
palette = [cc.rainbow[(len(bins)-1-i)*15*int(17/(len(bins)))] for i in range(len(bins))]
col_conv = dict(zip(bins,palette))
min_price = min(df['price'])
max_price = max(df['price'])
sizes = [5*i+5 for i in range(1,len(bins)+1)]
size_conv = dict(zip(bins,sizes))
# Vectorized dict lookups replace the original row-by-row .loc loop
# (one .map call instead of ~43k individual .loc assignments). Every
# binned2 value is a key of the dicts, so the result is identical.
df['color'] = df['binned2'].map(col_conv)
df['size'] = df['binned2'].map(size_conv)
# Animated-scatter data: renovated houses only (yr_renovated > 0), and
# exclude the 33-bedroom outlier row.
scatter_df = df[['price','bedrooms','sqft_living15','max_yr_built_or_renovated','color', 'size', 'price_names']][(df['yr_renovated']>0)\
                &(df['bedrooms']<33)]
# Start the plot at year 2010; update_scatter() swaps in other years.
temp_df = scatter_df[scatter_df['max_yr_built_or_renovated']==2010]
source = ColumnDataSource(temp_df)
p = bplt.figure(plot_width=900, plot_height=350, title="Prices of Houses Built or Last Renovated in 2010",\
                x_axis_label = 'Living Space', y_axis_label = "Number of Bedrooms",\
                y_range = (0,max(scatter_df['bedrooms'])*1.05),\
                x_range = (min(scatter_df['sqft_living15'])*0.95,max(scatter_df['sqft_living15'])*1.05))
p.scatter(x='sqft_living15', y='bedrooms', color='color', size='size', source=source, alpha=0.8)
style(p)
h = HoverTool(tooltips=[('Price', '@price_names'),('Number of Bedrooms', '@bedrooms'),('Size of Living Space', '@sqft_living15')])
p.add_tools(h)
callback_scatter = CustomJS(code="""
if (IPython.notebook.kernel !== undefined) {
var kernel = IPython.notebook.kernel;
cmd = "update_scatter(" + cb_obj.value + ")";
kernel.execute(cmd, {}, {});
}
""")
def update_scatter(year):
    # Called from JS: show only houses built/last renovated in `year`.
    temp_df = scatter_df[scatter_df['max_yr_built_or_renovated']==year]
    source.data = ColumnDataSource(temp_df).data
    p.title.text = "Prices of Houses Built or Last Renovated in " + str(year)
    push_notebook(handle=scatter_handle)
# Year slider spanning the data's full built/renovated range.
year_select = Slider(start = min(scatter_df['max_yr_built_or_renovated']), end = max(scatter_df['max_yr_built_or_renovated']),
                     step = 1, value = 2010,
                     title = 'Year',callback=callback_scatter)
callback_animate = CustomJS(code="""
var f = cb_obj.active;
var j = 1965;
if(f == true){
mytimer = setInterval(replace_data, 250);
} else {
clearInterval(mytimer);
}
function replace_data() {
j++;
if(j>2015){
j = 1965;
}
var kernel = IPython.notebook.kernel;
cmd = "update_scatter(" + j + ")";
kernel.execute(cmd, {}, {});
}
""")
btn = Toggle(label="Play/Stop Animation (from 1965)", button_type="success",
active=False, callback=callback_animate)
scatter_handle = show(column(btn,year_select,p), notebook_handle=True)
# Split the YYYYMMDD... date string into numeric year/month/day features.
# (Note: this overwrites the earlier string 'month' labels with numbers.)
df['year'] = pd.to_numeric(df.date.str.slice(0, 4))
df['month'] = pd.to_numeric(df.date.str.slice(4, 6))
df['day'] = pd.to_numeric(df.date.str.slice(6, 8))
# Feature columns for modelling, with the target 'price' kept last.
cols = ['year','month','day','yr_renovated',
        'bedrooms','bathrooms','sqft_living','sqft_lot','floors','waterfront',
        'condition','grade','sqft_above','sqft_basement','yr_built',
        'zipcode','lat','long','sqft_living15','sqft_lot15','price']
model_data = pd.DataFrame(df, columns=cols)
# no id, date, year built or year renovated
Zipcode, longitude and latitude might not be well suited to being treated as linear numeric features.
Split: 65% train, 15% validation (80 − 65), 20% test (100 − 80).
# Shuffle (sample(frac=1)) then split 65% / 15% / 20% into train/validate/test.
train, validate, test = np.split(model_data.sample(frac=1), [int(.65*len(model_data)), int(.80*len(model_data))])
def exclude_price(df):
    """Return the DataFrame's column Index with the 'price' column filtered out."""
    keep = ~df.columns.isin(['price'])
    return df.columns[keep]
wo_price = exclude_price(train)
x_train, x_validate, x_test = train[wo_price], validate[wo_price], test[wo_price]
y_train, y_validate, y_test = train['price'], validate['price'], test['price']
print('Number of columns in data: ', x_train.shape[1]-1)
print('Size of training set: ', x_train.shape[0])
print('Size of validation set: ', x_validate.shape[0])
print('Size of test set: ', test.shape[0])
print("Number of rows with nulls is %s" % train[train.isnull().any(axis=1)].shape[0])
Option 1: normalise the features to the [0, 1] range with min–max scaling.
# Min-max scale features and targets. The scaler is FIT on the training data
# only and then reused to transform validation/test, so every split shares one
# scale. (The original called fit_transform on each split separately, which
# leaks split statistics and puts the splits on incompatible scales.)
min_max_scaler = MinMaxScaler()
x_train_mnmx = pd.DataFrame(min_max_scaler.fit_transform(x_train.values))
x_validate_mnmx = pd.DataFrame(min_max_scaler.transform(x_validate.values))
x_test_mnmx = pd.DataFrame(min_max_scaler.transform(x_test.values))
# Scaled copy of the full dataset, used only for the correlation heatmap.
corr_df = pd.DataFrame(MinMaxScaler().fit_transform(model_data.values))
# Same train-fit/transform discipline for the target.
y_scaler = MinMaxScaler()
y_train_mnmx = pd.DataFrame(y_scaler.fit_transform(y_train.values.reshape(-1, 1)))
y_validate_mnmx = pd.DataFrame(y_scaler.transform(y_validate.values.reshape(-1, 1)))
Option 2: standardise the features into z-scores using the mean and standard deviation.
# Standardise features to z-scores using statistics pooled over the training
# and validation rows. pd.concat replaces DataFrame.append, which was
# deprecated in pandas 1.4 and removed in pandas 2.0.
pooled_x = pd.concat([x_train, x_validate], ignore_index=True)
means = pooled_x.mean(axis=0)
stds = pooled_x.std(axis=0)
x_train_zscore = pd.DataFrame()
for c in cols[:(len(cols)-1)]:  # all columns except the trailing 'price'
    x_train_zscore[c] = (x_train[c]-means[c])/stds[c]
x_validate_zscore = pd.DataFrame()
for c in cols[:(len(cols)-1)]:
    x_validate_zscore[c] = (x_validate[c]-means[c])/stds[c]
# Same pooled-statistics standardisation for the target.
pooled_y = pd.concat([y_train, y_validate], ignore_index=True)
ymeans = pooled_y.mean(axis=0)
ystds = pooled_y.std(axis=0)
y_train_zscore = (y_train-ymeans)/ystds
y_validate_zscore = (y_validate-ymeans)/ystds
# Correlation heatmap of the min-max-scaled dataset.
f, ax = plt.subplots(figsize=(10, 8))
corr = pd.DataFrame(corr_df).corr()
corr.columns = cols
corr.index = cols
# `np.bool` was removed in NumPy 1.24 — use the builtin `bool` dtype instead.
# (An all-False mask hides nothing; it is kept only for call compatibility.)
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool),
            cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)
# Styled table view of the same matrix, rounded to two decimals.
corr.style.background_gradient().set_precision(2)
By correlation, which measures linear relationships, a parsimonious dataset would contain bedrooms, bathrooms, sqft_living, grade and condition.
# Reduced feature set chosen from the correlation analysis above.
small_data_cols = ['bedrooms', 'bathrooms', 'sqft_living', 'grade', 'condition']
# The min-max frames carry integer column labels, so map names to positions.
sdindex = list(map(cols.index, small_data_cols))
small_x_train = x_train_mnmx[sdindex]
small_x_validate = x_validate_mnmx[sdindex]
small_x_test = x_test_mnmx[sdindex]
Year, month and day are clearly uncorrelated with the other features, while the area features correlate with one another — nothing unexpected.
def model1(x_size, y_size):
    """Baseline MLP: tanh(50) -> relu(25) -> linear output of size y_size.

    Compiled with mean-squared-error loss, RMSprop, and MAE as the metric.
    Prints the layer summary as a side effect and returns the compiled model.
    """
    net = Sequential([
        Dense(50, activation="tanh", input_shape=(x_size,)),
        Dense(25, activation="relu"),
        Dense(y_size),
    ])
    print(net.summary())
    net.compile(loss='mean_squared_error', optimizer=RMSprop(), metrics=[metrics.mae])
    return net
Model 2 uses the reduced (small) feature set.
def model2(x_size, y_size):
    """Deeper MLP for the reduced feature set, with batch-norm and dropout.

    BUG FIX: the original wrote `BatchNormalization(),` as bare expression
    statements, so the layers were instantiated but never attached to the
    model. They are now registered with model.add().

    Compiled with mean-absolute-error loss, Adam, and MAE as the metric.
    """
    model = Sequential()
    model.add(Dense(100, activation="relu", input_shape=(x_size,)))
    model.add(BatchNormalization())
    model.add(Dropout(0.1))
    model.add(Dense(50, activation="relu"))
    model.add(BatchNormalization())
    model.add(Dense(20, activation="sigmoid"))
    model.add(BatchNormalization())
    model.add(Dense(y_size))
    print(model.summary())
    model.compile(loss='mean_absolute_error', optimizer=Adam(), metrics=[metrics.mae])
    return(model)
def model3(x_size, y_size):
    """Regularised MLP: dropout plus L1 / L1+L2 weight and bias penalties.

    Compiled with mean-squared-error loss, the 'nadam' optimiser, and MAE as
    the tracked metric. Returns the compiled model.
    """
    layer_stack = [
        Dense(80, activation="tanh", kernel_initializer='normal', input_shape=(x_size,)),
        Dropout(0.2),
        Dense(120, activation="relu", kernel_initializer='normal',
              kernel_regularizer=regularizers.l1(0.01), bias_regularizer=regularizers.l1(0.01)),
        Dropout(0.1),
        Dense(20, activation="relu", kernel_initializer='normal',
              kernel_regularizer=regularizers.l1_l2(0.01), bias_regularizer=regularizers.l1_l2(0.01)),
        Dropout(0.1),
        Dense(10, activation="relu", kernel_initializer='normal'),
        Dense(y_size),
    ]
    model = Sequential(layer_stack)
    model.compile(loss='mean_squared_error', optimizer='nadam', metrics=[metrics.mae])
    return model
Now we create the models, using the factory functions defined above.
# Instantiate each candidate network. model1/model2 already print a summary
# inside the factory, so those summaries appear twice.
m1 = model1(x_train.shape[1], 1)
m1.summary()
m2 = model2(small_x_train.shape[1], 1)
m2.summary()
m3 = model3(x_train.shape[1], 1)
m3.summary()
# m4 reuses model 3's architecture but will be fed z-scored features.
m4 = model3(x_train.shape[1], 1)
m4.summary()
# Training hyperparameters shared by all fits below.
epochs = 500
batch_size = 128
print('Epochs: ', epochs)
print('Batch size: ', batch_size)
# Fit every model against raw (unscaled) prices, with early stopping on the
# validation MAE. NOTE(review): the metric key 'val_mean_absolute_error'
# matches older Keras releases — confirm it for the installed version.
history1 = m1.fit(x_train_mnmx.values, y_train.values,batch_size=batch_size,epochs=epochs,shuffle=True,verbose=0,
validation_data=(x_validate_mnmx.values, y_validate.values),
callbacks=[EarlyStopping(monitor='val_mean_absolute_error', patience=20, verbose=0)])
# Model 2 sees only the reduced (small) feature set.
history2 = m2.fit(small_x_train.values, y_train.values, batch_size=batch_size, epochs=epochs, shuffle=True, verbose=0,
validation_data=(small_x_validate.values, y_validate.values),
callbacks=[EarlyStopping(monitor='val_mean_absolute_error', patience=20, verbose=0)])
history3 = m3.fit(x_train_mnmx.values, y_train.values,batch_size=batch_size,epochs=epochs,shuffle=True,verbose=0,
validation_data=(x_validate_mnmx.values, y_validate.values),
callbacks=[EarlyStopping(monitor='val_mean_absolute_error', patience=20, verbose=0)])
# Model 4 is the same architecture as model 3 but trained on z-scored inputs.
history4 = m4.fit(x_train_zscore.values, y_train.values,batch_size=batch_size,epochs=epochs,shuffle=True,verbose=0,
validation_data=(x_validate_zscore.values, y_validate.values),
callbacks=[EarlyStopping(monitor='val_mean_absolute_error', patience=20, verbose=0)])
# Report training/validation MAE (in $k) and loss (in $bn) for each model.
training_score = m1.evaluate(x_train_mnmx, y_train, verbose=0)
validation_score = m1.evaluate(x_validate_mnmx, y_validate, verbose=0)
print('Model 1')
print('Training MAE: $%sk\nTraining (Squared) Loss: $%sbn\nValidation MAE: $%sk\nValidation (Squared) Loss: $%sbn\n'\
%(int(round(training_score[1]/1000, 0)),int(round(training_score[0]/1000000000, 0)),
int(round(validation_score[1]/1000, 0)),int(round(validation_score[0]/1000000000, 0))))
# Model 2 optimises mean_absolute_error, so score[0] is an MAE; MAE^2 is only
# a lower bound on the true squared loss — hence the ">=" in the label.
training_score = m2.evaluate(small_x_train, y_train, verbose=0)
validation_score = m2.evaluate(small_x_validate, y_validate, verbose=0)
print('Model 2')
print('Training MAE: $%sk\nTraining (Squared) Loss: >= $%sbn\nValidation MAE: $%sk\nValidation (Squared) Loss: >= $%sbn\n'\
%(int(round(training_score[1]/1000, 0)),int(round(pow(training_score[0],2)/1000000000, 0)),
int(round(validation_score[1]/1000, 0)),int(round(pow(validation_score[0],2)/1000000000, 0))))
# BUG FIX: models 3 and 4 train with mean_squared_error, so score[0] is
# ALREADY the squared loss; the original pow(score[0], 2) squared it a second
# time and grossly overstated the loss. Report score[0] directly, as model 1.
training_score = m3.evaluate(x_train_mnmx, y_train, verbose=0)
validation_score = m3.evaluate(x_validate_mnmx, y_validate, verbose=0)
print('Model 3')
print('Training MAE: $%sk\nTraining (Squared) Loss: $%sbn\nValidation MAE: $%sk\nValidation (Squared) Loss: $%sbn\n'\
%(int(round(training_score[1]/1000, 0)),int(round(training_score[0]/1000000000, 0)),
int(round(validation_score[1]/1000, 0)),int(round(validation_score[0]/1000000000, 0))))
training_score = m4.evaluate(x_train_zscore, y_train, verbose=0)
validation_score = m4.evaluate(x_validate_zscore, y_validate, verbose=0)
print('Model 4')
print('Training MAE: $%sk\nTraining (Squared) Loss: $%sbn\nValidation MAE: $%sk\nValidation (Squared) Loss: $%sbn\n'\
%(int(round(training_score[1]/1000, 0)),int(round(training_score[0]/1000000000, 0)),
int(round(validation_score[1]/1000, 0)),int(round(validation_score[0]/1000000000, 0))))
# Learning curves for model 4: MAE per epoch (in $k) for train vs validation.
p = bplt.figure(plot_width=900,title="Model 4 Performance",
x_axis_label = 'Epoch', y_axis_label = "Mean Absolute Error ($k)")
perf = pd.DataFrame(
{'training': [i/1000 for i in history4.history['mean_absolute_error']],
'validation': [i/1000 for i in history4.history['val_mean_absolute_error']],
'epoch': [i for i in range(len(history4.history['val_mean_absolute_error']))],
})
source_perf = ColumnDataSource(perf)
p.line('epoch','training',line_width=2, color='purple', legend="Training", source=source_perf)
p.line('epoch','validation',line_width=2, color='pink', legend="Validation", source=source_perf)
h = HoverTool(tooltips=[('Train MAE ($k)', '@training'),('Validation MAE ($k)', '@validation'),('Epoch', '@epoch')])
p.add_tools(h)
p.legend.location = "top_right"
# Clicking a legend entry hides its line.
p.legend.click_policy="hide"
# style() is a plot-styling helper defined elsewhere in this notebook.
style(p)
show(p)
Train Model 4 again on both the training set and the validation set, then compare its predictions on the test set with the actual values.
# Standardise the test features with the SAME pooled train+validation
# statistics (means/stds computed earlier) that the model was trained on.
# The original fit new means/stds on the test set itself, which puts the
# test features on a different scale from the one the network learned.
x_test_zscore = pd.DataFrame()
for c in cols[:(len(cols)-1)]:  # all columns except the trailing 'price'
    x_test_zscore[c] = (x_test[c]-means[c])/stds[c]
# Retrain the model-3 architecture ("model 4") on train + validation, with
# the held-out test set driving early stopping.
m5 = model3(x_train.shape[1], 1)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
new_x_train = pd.concat([x_train_zscore, x_validate_zscore], ignore_index=True)
new_y_train = pd.concat([y_train, y_validate], ignore_index=True)
history5 = m5.fit(new_x_train.values, new_y_train.values, batch_size=batch_size, epochs=epochs, shuffle=True, verbose=0,
                  validation_data=(x_test_zscore.values, y_test.values),
                  callbacks=[EarlyStopping(monitor='val_mean_absolute_error', patience=40, verbose=0)])
# Learning curves for the retrained model: train vs TEST MAE per epoch ($k).
p = bplt.figure(plot_width=900,title="Model 4 Performance Using Validation and Training Data",
x_axis_label = 'Epoch', y_axis_label = "Mean Absolute Error ($k)")
# 'validation' here actually holds the TEST-set MAE (it was passed as
# validation_data to fit), which is why the legend/tooltips say "Test".
perf2 = pd.DataFrame(
{'training': [i/1000 for i in history5.history['mean_absolute_error']],
'validation': [i/1000 for i in history5.history['val_mean_absolute_error']],
'epoch': [i for i in range(len(history5.history['val_mean_absolute_error']))],
})
source_perf2 = ColumnDataSource(perf2)
p.line('epoch','training',line_width=2, color='purple', legend="Training", source=source_perf2)
p.line('epoch','validation',line_width=2, color='blue', legend="Test", source=source_perf2)
h = HoverTool(tooltips=[('Train MAE ($k)', '@training'),('Test MAE ($k)', '@validation'),('Epoch', '@epoch')])
p.add_tools(h)
p.legend.location = "top_right"
# Clicking a legend entry hides its line.
p.legend.click_policy="hide"
# style() is a plot-styling helper defined elsewhere in this notebook.
style(p)
show(p)
# Final scores for the retrained model (MAE in $k, loss in $bn).
training_score = m5.evaluate(new_x_train, new_y_train, verbose=0)
validation_score = m5.evaluate(x_test_zscore, y_test, verbose=0)
print('Model 4 Trained on Training and Validation Datasets')
# BUG FIX: m5 trains with mean_squared_error, so score[0] is already the
# squared loss; the original pow(score[0], 2) squared it a second time.
print('Training MAE: $%sk\nTraining (Squared) Loss: $%sbn\nValidation MAE: $%sk\nValidation (Squared) Loss: $%sbn\n'\
%(int(round(training_score[1]/1000, 0)),int(round(training_score[0]/1000000000, 0)),
int(round(validation_score[1]/1000, 0)),int(round(validation_score[0]/1000000000, 0))))
test_y_predictions = m5.predict(x_test_zscore)
# Log-scale values (log of price in $k) for the scatter plot, plus plain
# integer $k values for hover tooltips.
# NOTE(review): log() assumes every prediction is strictly positive — a
# non-positive prediction would raise a math domain error; confirm this holds.
pred_pd = pd.DataFrame(
{'predicted': [log(i[0]/1000) for i in test_y_predictions.tolist()],
'actual': [log(i/1000) for i in y_test],
'predicted_nl': [int(i[0]/1000) for i in test_y_predictions.tolist()],
'actual_nl': [int(i/1000) for i in y_test]
})
# Shared axis bounds so the identity line spans the full data range.
mn = min(min(pred_pd['predicted']),min(pred_pd['actual']))
mx = max(max(pred_pd['predicted']),max(pred_pd['actual']))
# Two points defining the y = x reference line (perfect predictions).
pred_line = pd.DataFrame(
{'x': [mn,mx],
'y': [mn,mx]
})
source_pred = ColumnDataSource(pred_pd)
source_line = ColumnDataSource(pred_line)
p = bplt.figure(plot_width=900,title="Actual House Prices vs Predicted House Prices",\
x_axis_label = "Log of Predicted", y_axis_label = 'Log of Actual')
p.scatter(x='predicted', y='actual', source=source_pred)
h = HoverTool(tooltips=[('Actual Price ($k)', '@actual_nl'),('Predicted Price ($k)', '@predicted_nl')])
p.add_tools(h)
p.line(x='x',y='y',line_width=1, line_dash='dashed', color='red', source=source_line)
# NOTE(review): no glyph here carries a legend label, so this location
# setting appears to have no visible effect — confirm before removing.
p.legend.location = "bottom_right"
# style() is a plot-styling helper defined elsewhere in this notebook.
style(p)
show(p)
# Persist the predictions, the actual prices, and the full test rows for
# later analysis. The `with` statement closes each file automatically, so
# the original's explicit handle.close() calls were redundant and removed.
with open('model4_predictions.pckl', 'wb') as handle:
    pickle.dump(test_y_predictions, handle)
with open('model4_actual.pckl', 'wb') as handle:
    pickle.dump(y_test, handle)
with open('model4_actual_extended.pckl', 'wb') as handle:
    pickle.dump(test, handle)